from pymongo import MongoClient
import pymongo
from tokenizer import tokenize
import numpy as np
from gensim.models import Word2Vec
import pickle as pkl

# Connect to the MongoDB server on port 27017 and collect one tokenized text
# per document, across every collection of the 'tiktok' database.
client = MongoClient(port=27017)
db = client['tiktok']
texts = []
for col in db.list_collection_names():
    for obj in db[col].find():
        feature = obj['text_feature']
        # Start from the main text, then append every 'stickerText' entry,
        # separated by single spaces.
        text = feature['text']
        for item in feature['stickerText']:
            text += ' ' + item
        text = tokenize(text)
        # Keep only documents whose tokenized result has more than two items.
        if len(text) > 2:
            texts.append(text)

# Report length statistics for the collected texts. The lengths are computed
# once and reused by all three summaries.
lengths = [len(text) for text in texts]
if not lengths:
    # max()/min() on an empty list and the average's division would fail with
    # unhelpful messages; fail early with an explicit one instead.
    raise ValueError('No texts were collected; cannot compute sentence length statistics.')

print('Max sentence length:', max(lengths))
print('Avg sentence length:', sum(lengths) / len(lengths))
print('Min sentence length:', min(lengths))

# Train 32-dimensional Word2Vec embeddings on the collected texts, keeping
# every word (min_count=1).
# NOTE: `size=` and `wv.vocab` are the gensim 3.x spellings; gensim 4 renamed
# them to `vector_size=` and `wv.key_to_index`.
model = Word2Vec(texts, min_count=1, size=32)
words = list(model.wv.vocab)

# Word indices start at 1; index 0 is reserved for the all-zero row below.
word_index = {word: idx for idx, word in enumerate(words, start=1)}

# Row 0 is a zero vector; row i holds the vector of the word with index i.
# Vectors are read through `model.wv[...]` because indexing the model directly
# (`model[word]`) is deprecated in gensim 3.x and removed in 4.0.
embedding_matrix = [np.zeros(32)]
embedding_matrix.extend(model.wv[word] for word in words)

embedding_matrix = np.array(embedding_matrix)

# Persist the vocabulary index, the trained model and the raw embedding matrix.
print("Vocab Size:", len(word_index))
# Use a context manager so the pickle file is flushed and closed
# deterministically instead of leaking the handle.
with open('D:\\Work\\kusuri\\data_pi\\word_index.pkl', 'wb') as index_file:
    pkl.dump(word_index, index_file)
model.save('D:\\Work\\kusuri\\data_pi\\word_embed.bin')
# embedding_matrix is already an ndarray, so it is saved directly without an
# extra copy; np.save appends the '.npy' extension.
np.save('D:\\Work\\kusuri\\data_pi\\embedding_matrix', embedding_matrix)

# Min-max scale the whole matrix into [0, 1] using its global extremes.
min_val = embedding_matrix.min()
max_val = embedding_matrix.max()
embedding_matrix = (embedding_matrix - min_val) / (max_val - min_val)

# Scaling shifts the first row away from zero; reset that row to zeros.
embedding_matrix[0] = 0

np.save('D:\\Work\\kusuri\\data_pi\\embedding_matrix_norm', embedding_matrix)
